### ---- SCRIPT 01: The purpose of this script is to pre-process the Twitter/X user data extracted from ---- ###
### ---- https://doi.org/10.7910/DVN/JDB0SE. It standardises (z-scores) the original left-right ideal   ---- ###
### ---- points to mean 0 and SD 1, categorises accounts as ordinary/elite, then generates a plot       ---- ###
### ---- comparing the two distributions. Finally, it joins guests with their ideal points and the      ---- ###
### ---- ideal points of their affiliated organisations.                                                ---- ###

# Print numbers in fixed notation instead of scientific notation
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # easier to work with large datasets

#### ---- DIRECTORIES ---- ####

# Folder layout, relative to the working directory. Each path keeps its trailing slash because
# file paths are assembled with paste0().
raw_dir <- "00-raw_data/"
processed_dir <- "01-processed_data/"
figure_dir <- "02-figures/"

#### ---- LOAD DATA ---- ####

# Build the path to a file inside the raw data folder
raw_file <- function(file_name) {
  paste0(raw_dir, file_name)
}

# Guest attributes
username_list <- fread(raw_file("username_master_list.csv"))

# Organisation attributes
organisation_list <- fread(raw_file("organisation_master_list.csv"))

# Twitter/X user data (redacted)
twitter_user_data <- fread(raw_file("user_ideal_point_data.csv"))

#### ---- PREPROCESSING ---- ####

# In the Twitter user data, normalise the ideal points using z-scaling (mean 0, SD 1).
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale" attributes, so the
# result is flattened with as.numeric() to store an ordinary numeric column.
twitter_user_data$user_ideal_point <- as.numeric(
  scale(twitter_user_data$svd.phi_1, center = TRUE, scale = TRUE)
)

# Also, categorise the data into two groups: ordinary accounts and elite accounts. An account is
# labelled elite if it is verified and/or has at least 30,000 followers, and ordinary otherwise.
# Rows for which the condition evaluates to NA keep an NA label.
twitter_user_data <- twitter_user_data %>%
  mutate(
    elite_account = ifelse(
      verified == "TRUE" | followers_count >= 30000,
      "Elite Account",
      "Ordinary Account"
    )
  )

# Compare the ideal point distributions of elite and ordinary accounts: overlaid density curves
# with narrow boxplots, and a dashed vertical reference line at zero
user_data_ideal_compare <- twitter_user_data %>%
  ggplot(aes(x = user_ideal_point, fill = factor(elite_account))) +
  geom_density(alpha = 0.7, bw = "sj") +
  geom_boxplot(width = 0.1, show.legend = FALSE) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = c("grey30", "grey80")) +
  labs(x = "Left-Right Ideological Position", fill = "Account Type") +
  theme_minimal() +
  theme(
    legend.position = "top",
    axis.title.y = element_blank(),
    axis.text.y = element_blank()
  )

# Save the figure as a 7 x 4 inch PNG at 300 dpi on a white background
ggsave(
  filename = paste0(figure_dir, "ordinary_vs_user_ideal_comparison.png"),
  plot = user_data_ideal_compare,
  width = 7,
  height = 4,
  units = "in",
  dpi = 300,
  bg = "white"
)

# Save the Twitter user data together with the two new columns
fwrite(twitter_user_data, file = paste0(processed_dir, "user_ideal_point_data_updated.csv"))

# Merge the guest and organisation data to their respective Twitter user data via their usernames.
# Twitter rows without a usable username are left out of the lookup: joins treat two blank (or two
# NA) keys as equal, so a guest or organisation with no username would otherwise be matched to
# every Twitter row that also lacks one.
twitter_lookup <- twitter_user_data %>%
  filter(!is.na(username), username != "")

username_list <- left_join(
  username_list,
  twitter_lookup,
  by = "username",
  suffix = c("", "_twitter")
)

# The organisation account's own ideal point is renamed to distinguish it from the guests' own
organisation_list <- left_join(organisation_list, twitter_lookup, by = "username") %>%
  rename(organisation_ideal_point = user_ideal_point)

# Calculate an additional set of ideal points for organisations that uses the mean ideal points of all
# accounts affiliated with them. This is ascertained by including all elite accounts that @ mention the
# organisation in their Twitter bio.
elite_users <- twitter_user_data %>% filter(elite_account == "Elite Account")

# Search pattern per organisation: "@" followed by its username, or NA when it has no username
organisation_list$username_match <- ifelse(
  organisation_list$username != "",
  paste0("@", organisation_list$username),
  NA_character_
)

# Only organisations with a username can be searched for
matchable_rows <- which(!is.na(organisation_list$username_match))

# NOTE(review): the pattern is matched as a case-insensitive substring, so a handle that is a prefix
# of a longer one (e.g. "@abc" inside "@abcnews") also counts as a mention - confirm this is intended.
affiliated_accounts_list <- lapply(matchable_rows, function(org_row) {
  org_handle_pattern <- organisation_list$username_match[org_row]
  org_label <- organisation_list$organisation[org_row]

  elite_users %>%
    filter(grepl(org_handle_pattern, description, ignore.case = TRUE)) %>%
    mutate(organisation = org_label)
})

# A zero-row template is stacked first so that the result always has the expected columns, even
# when no organisation has any affiliated account. Stacking once avoids growing a table in a loop.
affiliated_accounts_template <- elite_users %>%
  filter(FALSE) %>%
  mutate(organisation = character(0))

affiliated_accounts_df <- rbindlist(
  c(list(affiliated_accounts_template), affiliated_accounts_list)
)

# Summarise the affiliated accounts per organisation: number of affiliates, and the mean, median and
# standard deviation of their ideal points (missing ideal points are ignored). The standard
# deviation is NA for organisations with fewer than two non-missing ideal points.
affiliated_accounts_summary <- affiliated_accounts_df %>%
  group_by(organisation) %>%
  summarise(
    count = n(),
    affiliate_mean_ideal_point = mean(user_ideal_point, na.rm = TRUE),
    affiliate_median_ideal_point = median(user_ideal_point, na.rm = TRUE),
    affiliate_ideal_point_sd = sd(user_ideal_point, na.rm = TRUE)
  )

# Merge the organisation affiliation summary data to the original organisation data
organisation_list <- left_join(organisation_list, affiliated_accounts_summary, by = "organisation")

# Last step: attach the organisation-level ideal points to each guest via the organisation they
# represent or are affiliated with. A guest can therefore carry up to three ideal points: their
# own, their organisation's, and the mean across their organisation's affiliates.
organisation_ideal_points <- select(
  organisation_list,
  organisation,
  organisation_ideal_point,
  affiliate_mean_ideal_point
)

username_list <- username_list %>%
  left_join(organisation_ideal_points, by = "organisation")

# Save the enriched guest list
fwrite(
  username_list,
  file = paste0(processed_dir, "username_master_list_with_ideal_points.csv")
)

#### ---- END ---- ####